{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CountFrequencyEncoder\n",
    "<p>The CountFrequencyEncoder() replaces categories by the count of\n",
    "observations per category or by the fraction of observations per category.<br>\n",
    "For example, in the variable colour, if 10 observations are blue, blue will\n",
    "be replaced by 10. Alternatively, if 10% of the observations are blue, blue\n",
    "will be replaced by 0.1.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from feature_engine.encoding import CountFrequencyEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load titanic dataset from OpenML\n",
    "\n",
    "def load_titanic():\n",
    "    '''Download the Titanic dataset from OpenML and prepare it for the demo.\n",
    "\n",
    "    Values recorded as '?' become NaN, cabin is reduced to its first\n",
    "    character, pclass is cast to object, age and fare are cast to float,\n",
    "    missing embarked values are filled with 'C', and the boat, body and\n",
    "    home.dest columns are dropped.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The prepared Titanic data.\n",
    "    '''\n",
    "    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
    "    data = data.replace('?', np.nan)\n",
    "    # astype(str) turns NaN into the string 'nan', so missing cabins become 'n'\n",
    "    data['cabin'] = data['cabin'].astype(str).str[0]\n",
    "    data['pclass'] = data['pclass'].astype('O')\n",
    "    data['age'] = data['age'].astype('float')\n",
    "    data['fare'] = data['fare'].astype('float')\n",
    "    # assign the result back: an inplace fillna on a column selection is chained\n",
    "    # assignment and does not update the frame under pandas copy-on-write\n",
    "    data['embarked'] = data['embarked'].fillna('C')\n",
    "    return data.drop(columns=['boat', 'body', 'home.dest'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>survived</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>ticket</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allen, Miss. Elisabeth Walton</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>24160</td>\n",
       "      <td>211.3375</td>\n",
       "      <td>B</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allison, Master. Hudson Trevor</td>\n",
       "      <td>male</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Miss. Helen Loraine</td>\n",
       "      <td>female</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mr. Hudson Joshua Creighton</td>\n",
       "      <td>male</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mrs. Hudson J C (Bessie Waldo Daniels)</td>\n",
       "      <td>female</td>\n",
       "      <td>25.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pclass  survived                                             name     sex  \\\n",
       "0      1         1                    Allen, Miss. Elisabeth Walton  female   \n",
       "1      1         1                   Allison, Master. Hudson Trevor    male   \n",
       "2      1         0                     Allison, Miss. Helen Loraine  female   \n",
       "3      1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
       "4      1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
       "\n",
       "       age  sibsp  parch  ticket      fare cabin embarked  \n",
       "0  29.0000      0      0   24160  211.3375     B        S  \n",
       "1   0.9167      1      2  113781  151.5500     C        S  \n",
       "2   2.0000      1      2  113781  151.5500     C        S  \n",
       "3  30.0000      1      2  113781  151.5500     C        S  \n",
       "4  25.0000      1      2  113781  151.5500     C        S  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the prepared dataset and preview the first rows\n",
    "data = load_titanic()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the predictors from the target: 'survived' is the target, and\n",
    "# 'name' and 'ticket' are dropped from the predictors as well\n",
    "X = data.drop(['survived', 'name', 'ticket'], axis=1)\n",
    "y = data.survived"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cabin       0\n",
       "pclass      0\n",
       "embarked    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the variables we are going to encode have no missing values\n",
    "X[['cabin', 'pclass', 'embarked']].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cabin       object\n",
       "pclass      object\n",
       "embarked    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "''' Make sure that the variables are of type object.\n",
    "If not, cast them as object; otherwise the transformer will either raise an error (if we pass them in the variables argument)\n",
    "or not pick them up (if we leave variables=None). '''\n",
    "\n",
    "X[['cabin', 'pclass', 'embarked']].dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((916, 8), (393, 8))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split into training (70%) and test (30%) sets; random_state makes the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "\n",
    "# Show the shape of each set\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The CountFrequencyEncoder() replaces the categories by the count or frequency of the observations in the train set for that category.\n",
    "\n",
    "If we set encoding_method to \"count\", then for the variable colour, if 10 observations in the train set are blue, blue will be replaced by 10.<br><br> Alternatively, if we set encoding_method to \"frequency\" and 10% of the observations in the train set are blue, then blue will be replaced by 0.1."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Frequency\n",
    "\n",
    "Labels are replaced by the fraction of the observations that show that label in the train set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountFrequencyEncoder(encoding_method='frequency',\n",
       "                      variables=['cabin', 'pclass', 'embarked'])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "Parameters\n",
    "----------\n",
    "\n",
    "encoding_method : str, default='count'\n",
    "    Desired method of encoding.\n",
    "\n",
    "    'count': number of observations per category\n",
    "\n",
    "    'frequency': fraction of observations per category (e.g. 0.1 for 10%)\n",
    "\n",
    "variables : list\n",
    "    The list of categorical variables that will be encoded. If None, the\n",
    "    encoder will find and transform all object type variables.\n",
    "'''\n",
    "# Set up a frequency encoder for three variables and learn the mappings from the train set\n",
    "count_encoder = CountFrequencyEncoder(encoding_method='frequency',\n",
    "                                      variables=['cabin', 'pclass', 'embarked'])\n",
    "\n",
    "count_encoder.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cabin': {'n': 0.7663755458515283,\n",
       "  'C': 0.07751091703056769,\n",
       "  'B': 0.04585152838427948,\n",
       "  'D': 0.034934497816593885,\n",
       "  'E': 0.034934497816593885,\n",
       "  'A': 0.018558951965065504,\n",
       "  'F': 0.016375545851528384,\n",
       "  'G': 0.004366812227074236,\n",
       "  'T': 0.001091703056768559},\n",
       " 'pclass': {3: 0.5436681222707423,\n",
       "  1: 0.25109170305676853,\n",
       "  2: 0.2052401746724891},\n",
       " 'embarked': {'S': 0.7117903930131004,\n",
       "  'C': 0.19759825327510916,\n",
       "  'Q': 0.0906113537117904}}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The encoder_dict_ attribute holds, for each variable, the value that replaces each category\n",
    "count_encoder.encoder_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1139</th>\n",
       "      <td>0.543668</td>\n",
       "      <td>male</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>0.766376</td>\n",
       "      <td>0.71179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>533</th>\n",
       "      <td>0.205240</td>\n",
       "      <td>female</td>\n",
       "      <td>21.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>21.0000</td>\n",
       "      <td>0.766376</td>\n",
       "      <td>0.71179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>0.205240</td>\n",
       "      <td>male</td>\n",
       "      <td>42.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>27.0000</td>\n",
       "      <td>0.766376</td>\n",
       "      <td>0.71179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1150</th>\n",
       "      <td>0.543668</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14.5000</td>\n",
       "      <td>0.766376</td>\n",
       "      <td>0.71179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>393</th>\n",
       "      <td>0.205240</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>31.5000</td>\n",
       "      <td>0.766376</td>\n",
       "      <td>0.71179</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pclass     sex   age  sibsp  parch     fare     cabin  embarked\n",
       "1139  0.543668    male  38.0      0      0   7.8958  0.766376   0.71179\n",
       "533   0.205240  female  21.0      0      1  21.0000  0.766376   0.71179\n",
       "459   0.205240    male  42.0      1      0  27.0000  0.766376   0.71179\n",
       "1150  0.543668    male   NaN      0      0  14.5000  0.766376   0.71179\n",
       "393   0.205240    male  25.0      0      0  31.5000  0.766376   0.71179"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Transform the train and test sets with the fitted encoder, then preview the\n",
    "# encoded test set: cabin, pclass and embarked now hold frequencies\n",
    "train_t = count_encoder.transform(X_train)\n",
    "test_t = count_encoder.transform(X_test)\n",
    "test_t.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAFkCAYAAADbgnvLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deZRdZZ3u8e/D6IAymGKQQAcwKKAY6BJ00SoYBwZl0AvC7aZpRaP3ilca2yva9MUJRRu0BwUMDQ0qMmNDKyI00iC9xKSAEAkRmQIEIhSIgsKyCXnuH3sfclJWpU7V2VW7ap/ns1at2ufd51T9krP2U/u8+93vK9tERESzrFN3ARERUb2Ee0REAyXcIyIaKOEeEdFACfeIiAZar+4CAGbMmOFZs2bVXUZExLRy8803P2a7b7h9UyLcZ82axcDAQN1lRERMK5LuH2lfumUiIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBEu4REQ2UcI+IaKCEe0REAyXcIyIaaErcoTrZZh3/g7pLmFDLTj6g7hIiomY5c4+IaKCEe0REAyXcIyIaKOEeEdFACfeIiAZKuEdENNCo4S5pG0nXSVoqaYmkj5Xtm0m6RtJd5fdN217zKUl3S7pT0jsm8h8QERF/rJMz95XAx23vBLwe+IiknYHjgWttzwauLR9T7jsc2AXYFzhN0roTUXxERAxv1HC3vcL2LeX2U8BSYGvgIODc8mnnAgeX2wcBF9j+g+37gLuBPaouPCIiRjamPndJs4DdgJ8BW9heAcUfAGDz8mlbAw+2vWx52RYREZOk43CXtBFwKXCs7SfX9tRh2jzMz5snaUDSwODgYKdlREREBzoKd0nrUwT7ebYvK5sfkbRVuX8r4NGyfTmwTdvLZwIPD/2Ztufb7rfd39fXN976IyJiGJ2MlhFwFrDU9lfbdl0BHFVuHwVc3tZ+uKQNJW0HzAYWVFdyRESMppNZIfcCjgR+LmlR2fZp4GTgIklHAw8AhwLYXiLpIuAOipE2H7H9XOWVR0TEiEYNd9s3Mnw/OsDcEV5zEnBSF3VFREQXcodqREQDJdwjIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBEu4REQ2UcI+IaKCEe0REAyXcIyIaKOEeEdFACfeIiAZKuEdENFDCPSKigRLuERENlHCPiGigTpbZO1vSo5Jub2u7UNKi8mtZa4UmSbMkPdO274yJLD4iIobXyTJ75wBfB77VarD93ta2pFOB37Y9/x7bc6oqMCIixq6TZfZukDRruH3l4tmHAW+ptqyIiOhGt33ubwQesX1XW9t2km6VdL2kN470QknzJA1IGhgcHOyyjIiIaNdtuB8BnN/2eAWwre3dgOOA70p66XAvtD3fdr/t/r6+vi7LiIiIduMOd0nrAe8GLmy12f6D7cfL7ZuBe4Aduy0yIiLGppsz97cCv7C9vNUgqU/SuuX29sBs4N7uSoyIiLHqZCjk+cBPgVdKWi7p6HLX4azZJQPwJmCxpNuAS4AP2/51lQVHRMToOhktc8QI7X81TNulwKXdlxUREd3IHaoREQ2UcI+IaKCEe0REAyXcIyIaKOEeEdFACfeIiAZKuEdENFDCPSKigRLuERENlHCPiGighHtERAMl3CMiGijhHhHRQAn3iIgGSrhHRDRQJ4t1nC3pUUm3t7V9RtJDkhaVX/u37fuUpLsl3SnpHRNVeEREjKyTM/dzgH2Haf+a7Tnl15UAknamWKFpl/I1p7WW3YuIiMkzarjbvgHodKm8g4ALyoWy7wPuBvboor6IiBiHbvrcj5G0uOy22bRs2xp4sO05y8u2iIiYROMN99OBHYA5wArg1LJdwzzXw/0ASfMkDUgaGBwcHGcZERExnHGFu+1HbD9nexVwJqu7XpYD27Q9dSbw8Ag/Y77tftv9fX194ykjIiJGMK5wl7RV28NDgNZImiuAwyVt
KGk7YDawoLsSIyJirNYb7QmSzgf2BmZIWg6cCOwtaQ5Fl8sy4EMAtpdIugi4A1gJfMT2cxNTekREjGTUcLd9xDDNZ63l+ScBJ3VTVEREdCd3qEZENFDCPSKigRLuERENlHCPiGighHtERAMl3CMiGijhHhHRQAn3iIgGSrhHRDRQwj0iooES7hERDZRwj4hooIR7REQDJdwjIhoo4R4R0UAJ94iIBho13CWdLelRSbe3tf29pF9IWizpe5I2KdtnSXpG0qLy64yJLD4iIobXyZn7OcC+Q9quAV5te1fgl8Cn2vbdY3tO+fXhasqMiIixGDXcbd8A/HpI29W2V5YPbwJmTkBtERExTlX0ub8f+GHb4+0k3SrpeklvHOlFkuZJGpA0MDg4WEEZERHR0lW4S/pbYCVwXtm0AtjW9m7AccB3Jb10uNfanm+733Z/X19fN2VERMQQ4w53SUcB7wT+3LYBbP/B9uPl9s3APcCOVRQaERGdG1e4S9oX+CRwoO2n29r7JK1bbm8PzAburaLQiIjo3HqjPUHS+cDewAxJy4ETKUbHbAhcIwngpnJkzJuAz0laCTwHfNj2r4f9wRERMWFGDXfbRwzTfNYIz70UuLTboiIioju5QzUiooES7hERDZRwj4hooIR7REQDJdwjIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBEu4REQ2UcI+IaKCEe0REAyXcIyIaKOEeEdFACfeIiAYaNdwlnS3pUUm3t7VtJukaSXeV3zdt2/cpSXdLulPSOyaq8IiIGFknZ+7nAPsOaTseuNb2bODa8jGSdgYOB3YpX3Naa9m9iIiYPKOGu+0bgKFL5R0EnFtunwsc3NZ+QblQ9n3A3cAeFdUaEREdGm+f+xa2VwCU3zcv27cGHmx73vKyLSIiJlHVF1Q1TJuHfaI0T9KApIHBwcGKy4iI6G3jDfdHJG0FUH5/tGxfDmzT9ryZwMPD/QDb82332+7v6+sbZxkRETGc8Yb7FcBR5fZRwOVt7YdL2lDSdsBsYEF3JUZExFitN9oTJJ0P7A3MkLQcOBE4GbhI0tHAA8ChALaXSLoIuANYCXzE9nMTVHtERIxg1HC3fcQIu+aO8PyTgJO6KSoiIrqTO1QjIhoo4R4R0UCjdstETDWzjv9B3SVMqGUnH1B3CdEAOXOPiGighHtERAOlWyYiJk261CZPztwjIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBEu4REQ2UcI+IaKCEe0REAyXcIyIaKOEeEdFA455+QNIrgQvbmrYH/h+wCfBBoLXq9adtXznuCiMiYszGHe627wTmAEhaF3gI+B7wPuBrtk+ppMKIiBizqrpl5gL32L6/op8XERFdqCrcDwfOb3t8jKTFks6WtOlwL5A0T9KApIHBwcHhnhIREePUdbhL2gA4ELi4bDod2IGiy2YFcOpwr7M933a/7f6+vr5uy4iIiDZVnLnvB9xi+xEA24/Yfs72KuBMYI8KfkdERIxBFeF+BG1dMpK2att3CHB7Bb8jIiLGoKuVmCS9CHgb8KG25q9ImgMYWDZkX0RETIKuwt3208DLhrQd2VVFERHRtdyhGhHRQAn3iIgGSrhHRDRQwj0iooES7hERDZRwj4hooIR7REQDJdwjIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBEu4REQ2UcI+IaKCEe0REAyXcIyIaqNuVmJYBTwHPAStt90vaDLgQmEWxEtNhtp/orsyIiBiLKs7c97E9x3Z/+fh44Frbs4Fry8cRETGJJqJb5iDg3HL7XODgCfgdERGxFt2Gu4GrJd0saV7ZtoXtFQDl982He6GkeZIGJA0MDg52WUZERLTrqs8d2Mv2w5I2B66R9ItOX2h7PjAfoL+/313WERERbbo6c7f9cPn9UeB7wB7AI5K2Aii/P9ptkRERMTbjDndJL5b0ktY28HbgduAK4KjyaUcBl3dbZEREjE033TJbAN+T1Po537V9laSF
wEWSjgYeAA7tvsyIiBiLcYe77XuB1w7T/jgwt5uiIiKiO7lDNSKigRLuERENlHCPiGighHtERAMl3CMiGijhHhHRQAn3iIgGSrhHRDRQwj0iooES7hERDZRwj4hooIR7REQDJdwjIhoo4R4R0UAJ94iIBupmJaZtJF0naamkJZI+VrZ/RtJDkhaVX/tXV25ERHSim5WYVgIft31LudzezZKuKfd9zfYp3ZcXERHj0c1KTCuAFeX2U5KWAltXVVhERIxfJX3ukmYBuwE/K5uOkbRY0tmSNh3hNfMkDUgaGBwcrKKMiIgodR3ukjYCLgWOtf0kcDqwAzCH4sz+1OFeZ3u+7X7b/X19fd2WERERbboKd0nrUwT7ebYvA7D9iO3nbK8CzgT26L7MiIgYi25Gywg4C1hq+6tt7Vu1Pe0Q4PbxlxcREePRzWiZvYAjgZ9LWlS2fRo4QtIcwMAy4ENdVRgREWPWzWiZGwENs+vK8ZcTERFVyB2qERENlHCPiGighHtERAMl3CMiGijhHhHRQAn3iIgGSrhHRDRQwj0iooES7hERDZRwj4hooIR7REQDJdwjIhoo4R4R0UAJ94iIBkq4R0Q0UMI9IqKBJizcJe0r6U5Jd0s6fqJ+T0RE/LEJCXdJ6wLfAPYDdqZYem/nifhdERHxxybqzH0P4G7b99r+b+AC4KAJ+l0RETFENwtkr83WwINtj5cDe7Y/QdI8YF758HeS7pygWqaCGcBjk/XL9OXJ+k09I+/f9NX09+5PRtoxUeE+3MLZXuOBPR+YP0G/f0qRNGC7v+46Ynzy/k1fvfzeTVS3zHJgm7bHM4GHJ+h3RUTEEBMV7guB2ZK2k7QBcDhwxQT9roiIGGJCumVsr5R0DPAjYF3gbNtLJuJ3TRM90f3UYHn/pq+efe9ke/RnRUTEtJI7VCMiGijhHhHRQAn3iIgGSrhHRDRQwr1ikl4k6f9K+oSkF0j6K0lXSPqKpI3qri/GRtJmddcQ3ZPUc6NmEu7VOwfYAtgO+AHQD5xCcdfu6fWVFaORtJekpZKWSNpT0jXAgKQHJb2h7vpi7SRtNsLXy4D9665vsmUoZMUkLbI9R5KAFcBWtl0+vs32rjWXGCOQtAA4GtgI+HfgYNs3Stod+Gfbe9VaYKyVpOeA+1lz+hOXj7e2vUEthdVkouaW6XlloF/p8q9n+Th/Sae29W3/HEDSoO0bAWzfIumF9ZYWHbgXmGv7gaE7JD04zPMbLd0y1Rto9a3bfn+rUdIOwFO1VRWdaD8ePjVkX0+d9U1T/wBsOsK+r0xmIVNBumUmkSQ5/+FTlqQDgf+w/fSQ9h2A99juuYCI6SvhPgEkvRTos33PkPZdbS+uqayIxis/Ne9LMSvtSuAu4Grbq2otrAbplqmYpMOAXwCXlqMuXte2+5x6qopOSNq1bXt9SSeUw1i/KOlFddYWoyuPvesowv0YihXhjgQWtb+3vSLhXr1PA39qew7wPuDbkt5d7htuEZOYOs5p2z4ZeAVwKvBC4Iw6CooxOQF4s+0PUKz8trntPwf+gh58/zJapnrr2l4BYHuBpH2A70uayZDVqGLKaf/jOxd4ne1nJd0A3FZTTdE5Ac+U278HNgewvbjsKu0pCffqPSVph1Z/u+0VZcB/D9il3tJiFBtLOoTiE+2Gtp+FDGOdRq4ErpJ0PbAfcDE8f5dxz31qzgXVikl6LfB723cPaV8fOMz2efVUFqOR9K9Dmo63/YikLYHzbM+to67onKT9gZ0pbhi8pmxbh+Iehj/UWtwkS7hPAkm7276l7joieo2kGbYfq7uOOiTcK1beqr5GE3A58C6K/++E/BQm6U3AI7bvlPRnwOuBpbZ/UHNpMQpJ+wGnAQ8BHwW+A7wA2BA4yva1NZY36RLuFZO0CrgJaP8I+PqyzbbfUkthMSpJ/0AxfG49ivV/5wI/BN4M3Gr7EzWWF6OQtAg4AtgE+D5wgO2bJO1E0a029MSr0RLuFZP0PyjO
Gr5s+8qy7T7b29VbWYxG0hLg1RRDHx+imGzq6fJ6ya22X11rgbFWkm5pBbikB21v07ZvUTk8uWdknHvFbF8CHAC8TdLFkrYlQyCnC5fTQ7TuZmy9b6vIsTId/EbShyR9AnhC0l9L2lrSUcDv6i5usmUo5ASw/TvgryXNAc6lmEI2pr4fSPoJRT/tvwAXSbqJolvmhlori04cRXEj0yrg7RRdND+imAb4gzXWVYt0y0ywch73l9h+su5aYnTlohwu+2p3AA4BHgAu6cX5SWL6SrhXTNJXgUtt/1fdtUR3yrsaZwP32n6i7npi7CT9uFcHMSTcKyZpkOJjYB9wIXC+7VvrrSo6Iek7wLG2H5P0DoqumTspAv5vbF9ca4GxVpKGzrgqYEeK95BeWwUtfe7VW267X9Js4HDgO5LWBc6nCPpf1lterMVr2254ORF4o+1lkmYA11Lezh5T1jLgSeALFHPMCPgJxT0mPScjAKrXWlbvLtuft70LcBjFRbora60sRrNO2wRTqyj62ikDPydCU5ztA4FLgfkUf6iXAc/avt/2/bUWV4N0y1RM0q22d6u7jhi7cj7wTwLfAF5JMeXv5cBbgMdtf7zG8qJDkl4MfJ7i/dvd9syaS6pFwr1ikjYqh0LGNCTpFRTD5nakOFtfDvyb7R/VWliMWTmJ3xts99xc7pBwnxDl8Mc9gK0pumkeBhZk/dSIiSdp/dZ0zW1tPTeBWPrcKybp7RTrNn4G2J/ibtXPAneV+2KKkvRVSXvVXUeMj6R9JC0HHpZ0taRZbbuvrqeq+uTMvWKSlgL7lRdz2tu3A660vVMthcWoMox1epO0EPgr20vKOZ6+BBxZ3pDWc9fCcuZevVY/7VAPAetPci0xNstt9wNvBZ6iGMb6C0knStqx5tpidBvYXgLPz/F0MHBuubpWz53FZnhX9c4GFkq6AHiwbNuGYsz7WbVVFZ14fhgrxWiLz0valWKOkispRl/E1PWspC1t/wqgPIOfSzH97w71ljb50i0zAcr5ow+iuKAqijP5K2zfUWthsVa9+NG9SSS9FRi0fduQ9o2BY2yfVE9l9Ui4R5QyjDWaJH3uFZN0i6QTJG1fdy0xZv9dDmMFnh998fFy+baY4tqOvZ7rghlOwr16m1Is8/WfkhaUCwa8vO6ioiMLKd47ygUfTqJYlek4SV+qs7DoSOvYuy7HXrplKjdkqa83UlyMezewlGJo3fw664uRSbq9tZSepAGKicOekbQecEuvzSo43eTYW1PO3CeQ7Z/Y/t8UF1a/DLyh5pJi7Z6U1Fon9TGKyd6gGFWWY2UaybGXoZAT4Y+m9LX9HHBV+RVT14eB8yTdBjwKDEi6HtgV+GKtlUUncuy1SbdMRJty7v23s+bEYT+y/ZtaC4sYo4T7BJD0KoqPgz9rH1onaV/bPXcGETEZJO0JLLX9pKQXAscDuwN3AF+0/dtaC5xk6UesmKT/QzEH+EeB2yUd1LY7H+2nMEn7tm1vLOksSYslfVfSFnXWFh05G3i63P5HYGOK/vangX+tq6i6pM+9eh8E/tT278pZ6S6RNMv2P1LcrRpT1xdZ3Td7KrCCYom2dwPfpJirJKaudWyvLLf7WyNngBslLaqrqLrkzL1667a6YsqZIfcG9pP0VRLu00m/7RPKJdq+Bsyqu6AY1e2S3ldu3yapH6Cc9O3ZkV/WTAn36v1K0pzWgzLo3wnMAF5TW1XRic0lHSfp48BL2+9WJcfKdPAB4M2S7gF2Bn4q6V7gzHJfT8kF1YpJmgmsbM1MN2TfXrb/q4ayogOSThzSdJrtQUlbAl+x/Zd11BVjI+klwPaUo51sP1JzSbVIuEdEY0k60PYVdddRh1xQrZik11B8DNwa+CHwSdtPlPsW2N6jzvpi7cphrK3pmlvr315he2mthcWoJL17mObTyukjsH3ZJJdUq/QjVu90ivVTX0Nxx9yNbbPUZSWmKUzSJ4ELKC58L6CYSEzA+ZKOr7O26MhF
wPsprnG9q/x6cfn9nTXWVYt0y1RM0iLbc9oe7wPMB46k6MPdfcQXR60k/RLYxfazQ9o3AJbYnl1PZdEJSa8DTgYuAc6wbUn32d6u5tJqkTP36qlc+QUA29cB7wG+DfxJbVVFJ1YBw00Ru1W5L6Yw2wuBtwEbAD+WtAc9uHZqS87cKybpfwL32r5pSPu2wN/Z/mA9lcVoyjtUvw7cxer1b7elWDv1mEwdMX2U87j/A8X9Cj25cE7CPaKNpHWAPVhz/duF5eyCEdNGwr1i5ayCHwBmAle1j2uXdILtL9RWXIyZpM1s/7ruOmJ0ki4DLgP+LWvhps99InwTeDPwOPBP5bQDLcMN1YopQtJekpZKWiJpT0nXUMzp/qCknlvsYRrak2L+nwckXSTpkPJieE/KmXvFJC1uLcdWjq89jWLqgSOAm2zvVmd9MTJJC4CjgY2AfwcOtn2jpN2Bf7a9V60FxlpJutX2buUdqgdTHHOvA75Pscze1bUWOMly5l69588UbK+0PQ9YBPyYIjRi6lrf9s9t/xQYtH0jgO1bKBbKjqnNALafsv1t2/sDrwR+RjG3e09JuFdvoH1ecADbn6OYT3pWLRVFp9qPh08N2dezH++nkT/qZ7f9a9tn2H5LHQXVKd0yESVJBwL/YfvpIe07AO+x/ZV6KosYu4T7JJL0NtvX1F1HRK/pxWMv4T6JJD1ge9u664jhlRfAjwYOobhTtTVx2OXAWUOnJYjpoxePvcwKWTFJI00vKuBlk1lLjNm3gd9QTPy2vGybCRwFfAd4bz1lRSdy7K0pZ+4Vk/QE8Bf88cUdARfazkLLU5SkO22/coR9v7S942TXFJ3LsbemnLlX7ybgadvXD90h6c4a6onOPSHpUOBS26vg+ekIDgWeqLWy6ESOvTY5c48oSZoFfBl4C6vDfBPgOuB42/fVU1nE2CXcJ4GkGbYfq7uO6Jykl1EcH3nfpiFJmwFurYLWi3ITU8Uk7SfpPkk3StpN0hLgZ5KWS5pbd30xMkkHStoQwPbjCfbpRdK2ki6QNEhxV+pCSY+WbbPqrW7y5cy9YpIWUcxpsQnFnBYH2L5J0k7AeVmJaeqS9Azwe4q1b88HfpSpfqcPST+lmMP9ktb7Vs7SeihwrO3X11nfZMuZe/VW2V5azk/ydGvRjnKB5fx/T22/AGYDNwAfBx6WdIakN9dbVnRohu0L2/8g237O9gX04FDIjJap3m8kfQh4KcXoi7+mWLj3rQwz90VMKa0+2jOBMyVtCRwGnCxppu1t6i0vRnGzpNOAc1m9ktY2FPcp3FpbVTVJt0zFJG0DnECx5uZnKbpojgbuB/6mPIOPKag1ZewI+/7E9v2TXVN0rpy7/WjgINZcSesKijuM/1BjeZMu4R5RkrS37f+su46IKqQPuGLl6i+bldt9kr4l6eeSLpQ0s+76YmQJ9uaR9OO6a6hLztwrJukO2zuX2xdS3DV3MUWf+5/bflud9cX4SPq57dfUXUeMTNLioU3AjsCdAK0V0npFLqhWb9227VfYbk02dY6kY+soKDojaaQ1bgVsOZm1xLgsA54EvgA8Q/G+/QR4V4011SbhXr3/lPQ54Evl9sG2/03SPsBva64t1u5C4DzK5dqGeMEk1xJjZPtASYcA84FTbF8h6dlevRCebpmKSVof+Fvg/WXTTIobY/6dYn6SB+qqLdZO0s3AUbZvH2bfgxkKOT1IejHweeAVwO62e/JaV8J9AknaGFjP9uN11xKjk/RG4P7h/gBL6rc9UENZMU6SXgu8wfYZdddSh4R7xcqxts+6/I8tu2N2B+6w/cNai4voAZLWH7pqVi9O3pehkNVbSDGvDJI+AZwEvBA4TtKX6iwsRidpH0lfl3S5pEslnSzpFXXXFaMr37vlFNNGXD1ksrCr66mqPgn36q3bNs3oe4G5tr8A7AccUF9ZMRpJJwN/STF89VngXuAe4OJyEY+Y2r4CvMN2H8VF1WsktSYLU31l1SOjZar3
pKRXlxflHqMYZfEMxf91/phObQe0xrJLugC43vYnJF1CMaTu4lqri9FsYHsJgO1LJC0FLpN0PMOPgGq0hHv1PgycJ+k24FFgQNL1wK7AF2utLEazStJmtn8NvJzyngXbT0jquTO/aehZSVva/hWA7SXlGgrfB3aot7TJl3CvmO3FknYH3k5xd9xtFJMXHWf7N7UWF6P5InBrud7mq4D/BcU0EhTvY0xtxwNbAL9qNdheLmlv4CN1FVWXjJaJaFPOC7Q9cHf+GMd0lj7giknaUtLpkr4h6WWSPiNpsaSLJG1Vd30xqicoumPeUk4Ct2e6ZKYHSfu2bW8s6azy2PuupC3qrK0OCffqnQPcQbFYwHUUF1PfSXFBridvppguJL0duAv4DLA/xeimzwJ3lftiamu/pnUqsIJiXpmFwDdrqahG6ZapWPuCD5IesL1t275FtufUV12sTTm6Yj/by4a0bwdcaXunWgqLjki6pbVG8dBjrRePvVxQrV77p6FvrWVfTD3rUVz8HuohYP1JriXGbnNJx1GMaX+pJHn12WvPHXsJ9+pdLmkj27+zfUKrsbzL8Zc11hWjOxtYWI5xb1+D83DgrNqqik6dCbyk3D4XmAEMlmvhLqqtqpqkWyaijaSdGGYNTtt31FpYxBgl3CeBpG/Z/su664hoOkmvYvUfZwMPU/xx7rmF6dMtUzFJVwxtAvaRtAkUCwpMflXRLUk/tL1f3XXEyCR9EjgCuABYUDbPBM6XdIHtk2srrgY5c6+YpFsohkL+C8WZg4DzKfptsX19fdXF2pR3Fg+7C/i+7dynMIVJ+iWwyzDT/W4ALLE9u57K6pEz9+r1Ax+jWI3pE7YXSXomoT4tLASuZ/gZBDeZ5Fpi7FZRzAk0dFm9rcp9PSXhXjHbq4CvSbq4/P4I+X+eLpYCH7J919Adkh4c5vkxtRwLXCvpLlaPdtqWYrm9Y2qrqiYJnQliezlwqKR3UqzIHlPfZxh5PPRHJ7GOGAfbV0naEdiDNUc7LbT9XK3F1SB97pOgbRrZiIhJ0XN3bU00Se03Lu1cXuQZkLRM0p41lhZjJOnPJB2XeWWmB0m7SrpJ0oOS5kvatG3fgrW9tokS7tV7d9v23wMfs709cBjwtXpKihqE+XwAAAIySURBVE60B4CkDwJfp7jj8cRyNZ+Y2k6j6Fp7DcXd4DdKai3S0XPTR6TPfWK93PYPAWwvkPTCuguKtWoPgHnA22wPSjqFYl3VnhonPQ1tZPuqcvsUSTcDV0k6kiyzFxXYvryRScBMSS+y/XS5r+fOHqaZdcqP8utQXI8aBLD9e0kr6y0tOiBJG9v+LYDt6yS9B7gU2Kze0iZfwr16Bw15vA5AuVjA6ZNfTozBxsDNFH+Y3VqPU9JGDD/2PaaWLwM7UXzKAp5f9nIu8He1VVWTjJaJGIWkFwFb2L6v7loiOpULqpNI0ry6a4ixs/10gn1668VjL+E+ufLRfgrLULpG67ljL90yEyDTjk5Pkm4EvkDRZ/sB4H3AgbbvaV8+MaauHHur5cy9YuW0oxdQnCksoJiMShTTjmas9NS2ke2rbP/G9ikU85FcJen19OBQuukmx96acuZesUw7On1Jug14U2soXdm2K+VQOtsvq624GFWOvTXlzL16rWlHh+rJaUenmdZQuufZXgzMBS6rpaIYixx7bXLmXjFJ+1Lctj7stKNtd9BFRIVy7K0p4T4BJK1Dph1tFEnzbM+vu45Yuxx7q+UO1QlQLthx06hPjOmk54bSTUc59lbLmXtEmwyli6bIBdWIUobSRZPkzD2ilKF00SQ5c49YLUPpojFyQTVitWOBayUNO5SutqoixiHdMhFtMpQumiLhHhHRQOlzj4hooIR7REQDJdwjIhoo4R4R0UD/H9B4RLNkVRZZAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Bar plot of the number of test observations at each encoded pclass value\n",
    "test_t['pclass'].value_counts().plot.bar()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1139</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>533</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>21.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>21.0000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>42.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>27.0000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1150</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14.5000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>393</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>31.5000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      pclass     sex   age  sibsp  parch     fare cabin embarked\n",
       "1139       3    male  38.0      0      0   7.8958     n        S\n",
       "533        2  female  21.0      0      1  21.0000     n        S\n",
       "459        2    male  42.0      1      0  27.0000     n        S\n",
       "1150       3    male   NaN      0      0  14.5000     n        S\n",
       "393        2    male  25.0      0      0  31.5000     n        S"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map the encoded values back to the original categories.\n",
    "# NOTE(review): categories that share the same encoded value (e.g. cabin 'D' and\n",
    "# 'E' in encoder_dict_) cannot be told apart when inverting - verify the result\n",
    "# for such variables.\n",
    "test_orig = count_encoder.inverse_transform(test_t)\n",
    "test_orig.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count\n",
    "\n",
    "Labels are replaced by the number of observations that show that label in the train set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountFrequencyEncoder(variables=['cabin'])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this time we encode only 1 variable\n",
    "\n",
    "# variables is passed as a list, consistent with the frequency example\n",
    "count_enc = CountFrequencyEncoder(encoding_method='count',\n",
    "                                  variables=['cabin'])\n",
    "\n",
    "# learn the per-category counts from the train set\n",
    "count_enc.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cabin': {'n': 702,\n",
       "  'C': 71,\n",
       "  'B': 42,\n",
       "  'D': 32,\n",
       "  'E': 32,\n",
       "  'A': 17,\n",
       "  'F': 15,\n",
       "  'G': 4,\n",
       "  'T': 1}}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The learned count for each cabin category is stored in the encoder_dict_ attribute.\n",
    "\n",
    "count_enc.encoder_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1139</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>702</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>533</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>21.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>21.0000</td>\n",
       "      <td>702</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>42.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>27.0000</td>\n",
       "      <td>702</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1150</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14.5000</td>\n",
       "      <td>702</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>393</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>31.5000</td>\n",
       "      <td>702</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass     sex   age  sibsp  parch     fare  cabin embarked\n",
       "1139      3    male  38.0      0      0   7.8958    702        S\n",
       "533       2  female  21.0      0      1  21.0000    702        S\n",
       "459       2    male  42.0      1      0  27.0000    702        S\n",
       "1150      3    male   NaN      0      0  14.5000    702        S\n",
       "393       2    male  25.0      0      0  31.5000    702        S"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform the data: see the change in the head view for Cabin\n",
    "\n",
    "train_t = count_enc.transform(X_train)\n",
    "test_t = count_enc.transform(X_test)\n",
    "\n",
    "test_t.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select categorical variables automatically\n",
    "\n",
    "If we don't indicate which variables we want to encode, the encoder will find all categorical variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountFrequencyEncoder(variables=['pclass', 'sex', 'cabin', 'embarked'])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this time we ommit the argument for variable\n",
    "count_enc = CountFrequencyEncoder(encoding_method = 'count')\n",
    "\n",
    "count_enc.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pclass', 'sex', 'cabin', 'embarked']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can see that the encoder selected automatically all the categorical variables\n",
    "\n",
    "count_enc.variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1139</th>\n",
       "      <td>498</td>\n",
       "      <td>581</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>702</td>\n",
       "      <td>652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>533</th>\n",
       "      <td>188</td>\n",
       "      <td>335</td>\n",
       "      <td>21.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>21.0000</td>\n",
       "      <td>702</td>\n",
       "      <td>652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>188</td>\n",
       "      <td>581</td>\n",
       "      <td>42.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>27.0000</td>\n",
       "      <td>702</td>\n",
       "      <td>652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1150</th>\n",
       "      <td>498</td>\n",
       "      <td>581</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14.5000</td>\n",
       "      <td>702</td>\n",
       "      <td>652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>393</th>\n",
       "      <td>188</td>\n",
       "      <td>581</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>31.5000</td>\n",
       "      <td>702</td>\n",
       "      <td>652</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      pclass  sex   age  sibsp  parch     fare  cabin  embarked\n",
       "1139     498  581  38.0      0      0   7.8958    702       652\n",
       "533      188  335  21.0      0      1  21.0000    702       652\n",
       "459      188  581  42.0      1      0  27.0000    702       652\n",
       "1150     498  581   NaN      0      0  14.5000    702       652\n",
       "393      188  581  25.0      0      0  31.5000    702       652"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform the data: see the change in the head view\n",
    "\n",
    "train_t = count_enc.transform(X_train)\n",
    "test_t = count_enc.transform(X_test)\n",
    "\n",
    "test_t.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Note\n",
    "if there are labels in the test set that were not present in the train set, the transformer will introduce NaN, and raise a warning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
